In [45]:
import pandas as pd
import numpy as np
import time

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler,MinMaxScaler  # doctest: +SKIP
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.random_projection import SparseRandomProjection
from sklearn.random_projection import GaussianRandomProjection

from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import learning_curve
from silhouette import silhouette
import matplotlib.cm as cm
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from scipy.stats import norm, kurtosis



%matplotlib inline
In [2]:
X = pd.read_pickle('../homework1/X.pkl')
y = pd.read_pickle('../homework1/y.pkl')
income = pd.read_csv('../homework1/adult.csv')
In [3]:
country_counts = income['native.country'].value_counts()
country_counts_df = country_counts.to_frame(name='counts')
small = list(country_counts_df[country_counts_df.counts<100].index)
income_modified = income.copy()
income_modified.loc[income['native.country'].apply(lambda x:x in small+['?']),['native.country']] = 'other'
income_modified.loc[income_modified.workclass.apply(lambda x:x in ['?']),['occupation','workclass']] = ('other','other')

income_modified.drop(['fnlwgt','education','income'],axis=1,inplace=True)

le = preprocessing.LabelEncoder()

for item in income_modified.columns[income_modified.dtypes==object]:
    income_modified[item] = le.fit_transform(income_modified[item])

income_modified_stan = StandardScaler().fit_transform(income_modified)
pos = income_modified_stan[(income['capital.gain']>0)|(income['capital.gain']>0)]
In [4]:
X_stan = StandardScaler().fit_transform(X)
In [12]:
kmeans = KMeans(n_clusters=9, random_state=0)
kmeans.fit(X_stan)
Out[12]:
KMeans(n_clusters=9, random_state=0)
In [27]:
pd.DataFrame(kmeans.labels_,columns=['labels']).labels.value_counts()
Out[27]:
1    28460
6     1485
0      752
5      635
7      520
3      493
2      159
4       57
Name: labels, dtype: int64
In [97]:
pd.DataFrame(kmeans.labels_,columns=['labels']).labels.value_counts()
Out[97]:
4    10507
2     9509
0     6470
5     1826
7     1569
1     1361
3     1114
6      198
8        7
Name: labels, dtype: int64
In [15]:
sum(kmeans.labels_)
Out[15]:
159
In [17]:
kmeans.labels_.var()
Out[17]:
0.004859297329838416
In [34]:
kmeans.cluster_centers_[0].shape
Out[34]:
(59,)
In [35]:
X.shape
Out[35]:
(32561, 59)
In [ ]:
 
In [ ]:
 
In [7]:
from sklearn.mixture import GaussianMixture
co_type = ['full','diag','tied','spherical']
gm_co_scores = []
aics = []
for t in co_type:
    gm = GaussianMixture(n_components=9, random_state=10,covariance_type=t)
    gm_co = gm.fit_predict(X_stan)
    gm_co_scores.append(silhouette_score(X_stan,gm_co))
    aics.append(gm.aic(X_stan))
In [ ]:
 
In [10]:
plt.title('EM Covariance Type Performance')
plt.xlabel('Covariance Type')
plt.ylabel('AIC Score')
plt.xticks(range(4),co_type)
#plt.plot(gm_co_scores)
plt.plot(aics)
Out[10]:
[<matplotlib.lines.Line2D at 0x155bbb59d88>]
In [321]:
gm_co_scores
Out[321]:
[0.10546634557682896,
 0.10493767963874576,
 0.1086937568714917,
 -0.02597774179006021]
In [ ]:
 
In [99]:
pd.DataFrame(gm.fit_predict(X_stan),columns=['label']).label.value_counts()
Out[99]:
2    11655
1     7049
5     5117
3     3148
4     2487
6     1986
0      914
8      198
7        7
Name: label, dtype: int64
In [100]:
pd.DataFrame(kmeans.labels_,columns=['labels']).labels.value_counts()
Out[100]:
4    10507
2     9509
0     6470
5     1826
7     1569
1     1361
3     1114
6      198
8        7
Name: labels, dtype: int64
In [101]:
X.head()
Out[101]:
age education.num capital.gain capital.loss hours.per.week workclass_Federal-gov workclass_Local-gov workclass_Never-worked workclass_Private workclass_Self-emp-inc ... sex_Male native.country_Canada native.country_El-Salvador native.country_Germany native.country_India native.country_Mexico native.country_Philippines native.country_Puerto-Rico native.country_United-States native.country_other
0 90 9 0 4356 40 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 82 9 0 4356 18 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 1 0
2 66 10 0 4356 40 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
3 54 4 0 3900 40 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 1 0
4 41 10 0 3900 40 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 1 0

5 rows × 59 columns

In [108]:
kmeans.labels_[[0,2]]
Out[108]:
array([5, 5])
In [ ]:
 
In [113]:
pd.DataFrame(kmeans.labels_[X['native.country_Canada']==1])[0].value_counts()
Out[113]:
4    38
2    32
0    21
5    14
7     8
3     6
1     2
Name: 0, dtype: int64
In [115]:
pd.DataFrame(kmeans.labels_[X['native.country_other']==1])[0].value_counts()
Out[115]:
4    631
2    584
0    419
5    100
3     95
1     84
7     59
Name: 0, dtype: int64
In [8]:
gm.bic(X_stan)
Out[8]:
-5461387.248276959
In [19]:
ks = [2,4,6,8,15,20,200,2000]
In [ ]:
 
In [260]:
import time

d = X_stan

start = time.time()
ks = [2,4,6,8,10]
total = []
scores = []
for k in ks:
    print(k)
    gms = GaussianMixture(n_components=k, random_state=10)
    gms.fit(d)
    total.append(gms.aic(d))
    cluster_labels = gms.fit_predict(d)
    scores.append(silhouette_score(d, cluster_labels))

    print(time.time()-start)
plt.xticks(range(0,5), ks)
plt.title('Choose Cluster for EM')
plt.ylabel('AIC')
plt.plot(total)
2
19.013328790664673
4
42.743611097335815
6
78.12048959732056
8
150.5247838497162
10
205.06795930862427
Out[260]:
[<matplotlib.lines.Line2D at 0x19c9cfc4e88>]
In [37]:
import numpy as np
from sklearn.manifold import TSNE

X_embedded = TSNE(n_components=2).fit_transform(X)
X_embedded.shape
Out[37]:
(32561, 2)
In [40]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
n_neighbors=2

fig = plt.figure(figsize=(15, 8))
fig.suptitle("Manifold Learning with %i points, %i neighbors"
             % (1000, n_neighbors), fontsize=14)
Out[40]:
Text(0.5,0.98,'Manifold Learning with 1000 points, 2 neighbors')
<Figure size 1080x576 with 0 Axes>
In [48]:
from sklearn import manifold, datasets

n_points = 1000
X1, color = datasets.make_s_curve(32561, random_state=0)
In [49]:
color
Out[49]:
array([ 0.46005644,  2.028112  ,  0.968522  , ...,  1.21266926,
       -1.72622701,  3.92248014])
In [53]:
i=0
ax = fig.add_subplot(2, 5, 2 + i + (i > 3))
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=color, cmap=plt.cm.Spectral)
#ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
ax.axis('tight')
Out[53]:
(-96.01885854184566, 94.52276479184572, -96.44877498336895, 96.06880061813457)
In [52]:
X_embedded[:, 0]
Out[52]:
array([ 13.125074,  13.12485 ,  13.125004, ...,  35.173664,  50.48359 ,
       -35.992203], dtype=float32)
In [54]:
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=color, cmap=plt.cm.Spectral)
Out[54]:
<matplotlib.collections.PathCollection at 0x23badf63630>
In [56]:
X_embedded.shape
Out[56]:
(32561, 2)
In [57]:
X_embedded[:10]
Out[57]:
array([[ 13.125074, -74.85138 ],
       [ 13.12485 , -74.8513  ],
       [ 13.125004, -74.8514  ],
       [ 13.147255, -74.8633  ],
       [ 13.147282, -74.86335 ],
       [ 13.158456, -74.87092 ],
       [ 13.158326, -74.87084 ],
       [ 13.169389, -74.87893 ],
       [ 13.169538, -74.879005],
       [ 14.169823, -75.68437 ]], dtype=float32)
In [46]:
income = pd.read_csv('../homework1/adult.csv')
In [60]:
income.na
Out[60]:
Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')
In [68]:
gm.get_params()
Out[68]:
{'covariance_type': 'full',
 'init_params': 'kmeans',
 'max_iter': 100,
 'means_init': None,
 'n_components': 8,
 'n_init': 1,
 'precisions_init': None,
 'random_state': None,
 'reg_covar': 1e-06,
 'tol': 0.001,
 'verbose': 0,
 'verbose_interval': 10,
 'warm_start': False,
 'weights_init': None}
In [ ]:
 
In [50]:
X_standard = X_stan.copy()
In [27]:
import matplotlib.cm as cm

range_n_clusters = [2, 4,  6,8]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X_stan) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X_stan)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X_stan, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X_stan, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X_stan[:, 0], X_stan[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()
For n_clusters = 2 The average silhouette_score is : 0.08171694863868836
For n_clusters = 4 The average silhouette_score is : 0.0858816062531556
For n_clusters = 6 The average silhouette_score is : 0.09556409402257389
For n_clusters = 8 The average silhouette_score is : 0.07691835098548624
In [466]:
silhouette(X_stan,columns=[3,4])
For n_clusters = 2 The average silhouette_score is : 0.08171694863868836
For n_clusters = 4 The average silhouette_score is : 0.0858816062531556
For n_clusters = 6 The average silhouette_score is : 0.09556409402257389
For n_clusters = 8 The average silhouette_score is : 0.07691835098548624
In [88]:
silhouette(pos,columns=[3,4])
1
2
For n_clusters = 2 The average silhouette_score is : 0.6866712759041652
1
2
For n_clusters = 4 The average silhouette_score is : 0.15488300271215696
1
2
For n_clusters = 6 The average silhouette_score is : 0.16887262333301237
1
2
For n_clusters = 8 The average silhouette_score is : 0.168037432132915
In [56]:
silhouette(income_modified_stan)
1
2
For n_clusters = 2 The average silhouette_score is : 0.1652386576016733
1
2
For n_clusters = 4 The average silhouette_score is : 0.17492975757250373
1
2
For n_clusters = 6 The average silhouette_score is : 0.15894155185409176
1
2
For n_clusters = 8 The average silhouette_score is : 0.18507338932677941
In [75]:
x_np = X.to_numpy()
x_np = x_np[(X['capital.gain']>0)|(X['capital.loss'])>0]
In [76]:
silhouette(x_np,range_n_clusters=[2,4,6,8])#,columns=[3,4])
For n_clusters = 2 The average silhouette_score is : 0.9372089629808373
For n_clusters = 4 The average silhouette_score is : 0.667820929451939
For n_clusters = 6 The average silhouette_score is : 0.7705785011270774
For n_clusters = 8 The average silhouette_score is : 0.7692352273437891
In [78]:
silhouette(x_np,range_n_clusters=[2,4,6,8],columns=[3,4])
For n_clusters = 2 The average silhouette_score is : 0.9372089629808373
For n_clusters = 4 The average silhouette_score is : 0.667820929451939
For n_clusters = 6 The average silhouette_score is : 0.7705785011270774
For n_clusters = 8 The average silhouette_score is : 0.7692352273437891
In [57]:
income_modified_stan[0]
Out[57]:
array([ 3.76961234,  2.92262342, -0.42005962,  2.24948009,  1.75152457,
       -0.27780504,  0.39366753, -1.42233076, -0.14592048, 10.59350656,
       -0.03542945,  0.11053875])
In [58]:
income_modified.head()
Out[58]:
age workclass education.num marital.status occupation relationship race sex capital.gain capital.loss hours.per.week native.country
0 90 8 9 6 15 1 4 0 0 4356 40 7
1 82 3 9 6 4 1 4 0 0 4356 18 7
2 66 8 10 6 15 4 2 0 0 4356 40 7
3 54 3 4 0 7 4 4 0 0 3900 40 7
4 41 3 10 5 10 3 4 0 0 3900 40 7
In [ ]:
import time
start = time.time()
ks = [20,200,500,1000,2000,2500]
total = []
for k in ks:
    gms = GaussianMixture(n_components=k, random_state=10)
    gms.fit(income_modified_stan)
    total.append(gms.aic(income_modified_stan))
print(time.time()-start)
In [68]:
pos = income_modified_stan[(income['capital.gain']>0)|(income['capital.gain']>0)]
In [16]:
silhouette(income_modified_stan,range_n_clusters=[100,300,500],columns=[3,4])
For n_clusters = 100 The average silhouette_score is : 0.17025050375575018
For n_clusters = 300 The average silhouette_score is : 0.18819617151341902
For n_clusters = 500 The average silhouette_score is : 0.1899881677977891
In [70]:
income_modified.head()
Out[70]:
age workclass education.num marital.status occupation relationship race sex capital.gain capital.loss hours.per.week native.country
0 90 8 9 6 15 1 4 0 0 4356 40 7
1 82 3 9 6 4 1 4 0 0 4356 18 7
2 66 8 10 6 15 4 2 0 0 4356 40 7
3 54 3 4 0 7 4 4 0 0 3900 40 7
4 41 3 10 5 10 3 4 0 0 3900 40 7
In [78]:
colors = cm.nipy_spectral(1 / 8)
In [85]:
income_modified.iloc[:,2].shape
Out[85]:
(32561,)

running with 6 clusters

In [28]:
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans_label = kmeans.fit_predict(X_stan)
In [67]:
max(gm_label)
Out[67]:
5
In [66]:
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans_label = kmeans.fit_predict(X_stan)
gm = GaussianMixture(n_components=6, random_state=0)
gm_label = gm.fit_predict(X_stan)
In [29]:
gm = GaussianMixture(n_components=6, random_state=0)
gm_label = gm.fit_predict(X_stan)
In [62]:
from sklearn.metrics.cluster import v_measure_score
from sklearn.metrics.cluster import adjusted_mutual_info_score

print(v_measure_score(gm_label,y),
v_measure_score(kmeans_label,y))
0.06047935403905461 0.059698320657788344
In [63]:
from sklearn.metrics.cluster import v_measure_score
print(v_measure_score(gm_label,y),
adjusted_mutual_info_score(kmeans_label,y))
0.06047935403905461 0.05930603971652433
In [65]:
from sklearn.metrics.cluster import v_measure_score
print(v_measure_score(gm_label,y),
adjusted_mutual_info_score(kmeans_label,y))
0.13182142925581544 0.13184476413071444
In [58]:
print(v_measure_score(aa.ori_gm,y),v_measure_score(aa.ori_kmeans,y))
0.05924320054944691 0.05876438791065997
In [82]:
X.head()
Out[82]:
age education.num capital.gain capital.loss hours.per.week workclass_Federal-gov workclass_Local-gov workclass_Never-worked workclass_Private workclass_Self-emp-inc ... sex_Male native.country_Canada native.country_El-Salvador native.country_Germany native.country_India native.country_Mexico native.country_Philippines native.country_Puerto-Rico native.country_United-States native.country_other
0 90 9 0 4356 40 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 82 9 0 4356 18 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 1 0
2 66 10 0 4356 40 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
3 54 4 0 3900 40 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 1 0
4 41 10 0 3900 40 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 1 0

5 rows × 59 columns

In [30]:
results = pd.concat([income_modified,pd.DataFrame(kmeans_label,columns=['kmeans']),pd.DataFrame(gm_label,columns=['EM'])],
                    axis=1)
In [125]:
rr = pd.DataFrame(np.stack((y.to_numpy(),kmeans_label,gm_label)).T,columns = ['label','kmeans','EM'])
rr.head()
Out[125]:
label kmeans EM
0 0 4 0
1 0 0 0
2 0 4 0
3 0 0 3
4 0 0 2
In [127]:
emdf = []
for i in range(6):
    emdf.append(rr[rr.EM==i].label.value_counts())
In [128]:
emdf = pd.DataFrame(emdf,index=range(6)).T
In [132]:
emdf.index = ['EM0','EM1']
In [133]:
emdf
Out[133]:
0 1 2 3 4 5
EM0 4561.0 5815.0 5976.0 1752.0 6609.0 7.0
EM1 544.0 415.0 935.0 250.0 5697.0 NaN
In [109]:
for i in range(6):
    print(rr[rr.EM==i].label.value_counts())
0    4561
1     544
Name: label, dtype: int64
0    5815
1     415
Name: label, dtype: int64
0    5976
1     935
Name: label, dtype: int64
0    1752
1     250
Name: label, dtype: int64
0    6609
1    5697
Name: label, dtype: int64
0    7
Name: label, dtype: int64
In [135]:
kmeans_df = []
for i in range(6):
    kmeans_df.append(rr[rr.kmeans==i].label.value_counts())
kmeans_df = pd.DataFrame(kmeans_df,index=range(6)).T
kmeans_df.index = ['Kmeans0','Kmeans1']
kmeans_df
Out[135]:
0 1 2 3 4 5
Kmeans0 16037.0 6960.0 7.0 13.0 1643.0 60.0
Kmeans1 1817.0 5783.0 NaN 10.0 191.0 40.0
In [137]:
pd.concat([emdf,kmeans_df]).to_csv('raw.csv')
In [130]:
for i in range(6):
    print(rr[rr.kmeans==i].label.value_counts())
0    16037
1     1817
Name: label, dtype: int64
0    6960
1    5783
Name: label, dtype: int64
0    7
Name: label, dtype: int64
0    13
1    10
Name: label, dtype: int64
0    1643
1     191
Name: label, dtype: int64
0    60
1    40
Name: label, dtype: int64
In [133]:
rr[rr.kmeans==1].EM.value_counts()
Out[133]:
4    11750
3      893
1       91
2        5
0        4
Name: EM, dtype: int64
In [134]:
rr[rr.kmeans==0].EM.value_counts()
Out[134]:
2    6297
1    5690
0    4760
3    1107
Name: EM, dtype: int64
In [139]:
rr[rr.kmeans==4].EM.value_counts()
Out[139]:
2    587
4    489
1    421
0    337
Name: EM, dtype: int64
In [138]:
income.loc[rr[rr.EM==5].index,]
Out[138]:
age workclass fnlwgt education education.num marital.status occupation relationship race sex capital.gain capital.loss hours.per.week native.country income
8874 18 Never-worked 206359 10th 6 Never-married ? Own-child White Male 0 0 40 United-States <=50K
13675 23 Never-worked 188535 7th-8th 4 Divorced ? Not-in-family White Male 0 0 35 United-States <=50K
17089 17 Never-worked 237272 10th 6 Never-married ? Own-child White Male 0 0 30 United-States <=50K
21934 18 Never-worked 157131 11th 7 Never-married ? Own-child White Female 0 0 10 United-States <=50K
24483 20 Never-worked 462294 Some-college 10 Never-married ? Own-child Black Male 0 0 40 United-States <=50K
32331 30 Never-worked 176673 HS-grad 9 Married-civ-spouse ? Wife Black Female 0 0 40 United-States <=50K
32338 18 Never-worked 153663 Some-college 10 Never-married ? Own-child White Male 0 0 4 United-States <=50K
In [452]:
#fig, (ax1, ax2) = plt.subplots(1, 2)
combined_sort_label = []
columns = aa.columns[:-1]
i = 0
for col in columns:
    i+=1
    bar_x = range(max(aa[col])+1)
    bar_y = aa[col].value_counts().to_numpy()
    combined_sort_label.append(bar_y)
    plt.bar(bar_x, bar_y)
    #plt.xticks(range(max(aa[col])+1),aa[col].value_counts().index)
    if i==2:
        i=0
        plt.title(col[:4]+' GM and Kmeans')
        plt.xlabel('cluster group')
        plt.legend(('GM', 'Kmeans'))

        plt.show()
In [ ]:
 

feature reduction

In [10]:
X_stan.shape
Out[10]:
(32561, 59)
In [16]:
pca = PCA(random_state=0)
pca.fit(X_stan)
Out[16]:
PCA(random_state=0)
In [17]:
pca.explained_variance_ratio_
Out[17]:
array([7.53427746e-02, 4.37346902e-02, 4.24290734e-02, 3.96862110e-02,
       3.39571801e-02, 3.29785398e-02, 2.96613906e-02, 2.89575508e-02,
       2.52645301e-02, 2.32006676e-02, 2.18888009e-02, 2.09451567e-02,
       2.07136420e-02, 2.00950594e-02, 1.93836191e-02, 1.91661220e-02,
       1.90017949e-02, 1.84804957e-02, 1.84074100e-02, 1.81361553e-02,
       1.80612055e-02, 1.78765136e-02, 1.76003002e-02, 1.75910907e-02,
       1.73689262e-02, 1.72844472e-02, 1.71773878e-02, 1.70145587e-02,
       1.69646289e-02, 1.68864778e-02, 1.68173817e-02, 1.67236763e-02,
       1.65425623e-02, 1.64718907e-02, 1.62782520e-02, 1.61942708e-02,
       1.56301289e-02, 1.52680526e-02, 1.50630799e-02, 1.47975008e-02,
       1.44890508e-02, 1.35248408e-02, 1.23452996e-02, 1.14305419e-02,
       1.04511918e-02, 9.27134532e-03, 8.29304947e-03, 7.65266716e-03,
       7.15250758e-03, 3.46308983e-04, 1.48519021e-32, 5.20513739e-33,
       3.25692742e-33, 2.69228214e-33, 1.91443032e-33, 1.85755526e-33,
       1.00670126e-33, 4.44513653e-34, 4.91320576e-35])
In [19]:
pca.explained_variance_ratio_[:-5].sum()
Out[19]:
1.0
In [30]:
pca.explained_variance_ratio_[:34].sum()
Out[30]:
0.8118119117024294
In [252]:
len(pca.explained_variance_ratio_)
Out[252]:
34
In [251]:
 
Out[251]:
0.8105928290769348
In [25]:
pca.explained_variance_ratio_.size
Out[25]:
59
In [33]:
pca = PCA(n_components=34,random_state=0)
pca.fit(X_stan)
pca.explained_variance_ratio_#.sum()
Out[33]:
array([0.07534277, 0.04373461, 0.04242891, 0.03968601, 0.03395703,
       0.03297826, 0.02965915, 0.02895524, 0.0252638 , 0.02319876,
       0.02188494, 0.02093456, 0.02071117, 0.02009266, 0.0193609 ,
       0.0191336 , 0.0189833 , 0.01846592, 0.01829135, 0.01812522,
       0.01804467, 0.01785701, 0.01758391, 0.01749469, 0.01733701,
       0.01718808, 0.01714083, 0.0169632 , 0.01685763, 0.01680829,
       0.01676054, 0.01657578, 0.01653199, 0.01626101])
In [48]:
import seaborn as sns

def plot_pca(vector_num,top_comp):
    v_1 = pca.components_[vector_num]
    comps = pd.DataFrame(list(zip(v_1, X.columns.values)), 
                             columns=['weights', 'features'])
    comps['abs_weights']=comps['weights'].apply(lambda x: np.abs(x))
    sorted_weight_data = comps.sort_values('abs_weights', ascending=False).head(top_comp)
    ax=plt.subplots(figsize=(10,6))
    ax=sns.barplot(data=sorted_weight_data, 
                   x="weights", 
                   y="features", 
                   palette="Blues_d")
    ax.set_title("PCA Component Makeup, Component #" + str(vector_num))
    plt.show()
In [279]:
pca_X = pca.transform(X_stan)
In [282]:
len(pca_X)
Out[282]:
32561
In [52]:
plot_pca(1,5)
In [51]:
plot_pca(2,5)

ICA

In [54]:
transformer = FastICA(n_components=7,
        random_state=0)
X_transformed = transformer.fit_transform(X)
X_transformed.shape
Out[54]:
(32561, 7)
In [ ]:
 
In [55]:
from scipy.stats import norm, kurtosis
kurtosis(X_transformed)
Out[55]:
array([ 8.18760249e-01, -1.01196094e+00, -1.49941043e+00,  3.36974946e+00,
        1.55709093e+02,  1.86691347e-02,  2.03706755e+01])
In [60]:
len(transformer.components_[0])
Out[60]:
59
In [64]:
kurtosis(X_transformed.flatten())
Out[64]:
25.396510863393672
In [65]:
ks = [2,4,6,8,15,20,200,2000]
transformerlarge = FastICA(n_components=70,
        random_state=0)
X_transformed_large = transformerlarge.fit_transform(X_stan)
X_transformed_large.shape
C:\Users\zhy89\Anaconda3\lib\site-packages\sklearn\decomposition\_fastica.py:470: UserWarning: n_components is too large: it will be set to 59
  % n_components
C:\Users\zhy89\Anaconda3\lib\site-packages\sklearn\decomposition\_fastica.py:120: ConvergenceWarning: FastICA did not converge. Consider increasing tolerance or the maximum number of iterations.
  ConvergenceWarning)
Out[65]:
(32561, 59)
In [91]:
ks = [1,2,3,4,5,6,7,8,9,10]#,20,40]
def find_ica(ks,X_stan):
    results = []
    for k in ks:
        transformerlarge = FastICA(n_components=k,
                random_state=0)
        X_transformed_large = transformerlarge.fit_transform(X_stan)
        results.append(kurtosis(X_transformed_large))#.mean()
    return results
In [287]:
ks = [2,3,5,10,40,50]
results = find_ica(ks,X_stan)
rs = []
for r in results:
    rs.append(r.mean())
plt.title('Number of Components for ICA ')
plt.xlabel('Number of Components')
plt.ylabel('Kurtosis')
plt.xticks(range(6),ks)
plt.plot(rs)
Out[287]:
[<matplotlib.lines.Line2D at 0x1c020545080>]
In [95]:
rs = []
for r in results:
    rs.append(r.mean())
In [209]:
        transformerlarge = FastICA(n_components=5,
                random_state=0)
        X_transformed_large = transformerlarge.fit_transform(X_stan)
In [211]:
len(transformerlarge.components_)
Out[211]:
5
In [283]:
ica_X = transformerlarge.transform(X_stan)
In [ ]:
 
In [212]:
import seaborn as sns

def plot_ica(vector_num,top_comp,transformerlarge=transformerlarge):
    v_1 = transformerlarge.components_[vector_num]
    comps = pd.DataFrame(list(zip(v_1, X.columns.values)), 
                             columns=['weights', 'features'])
    comps['abs_weights']=comps['weights'].apply(lambda x: np.abs(x))
    sorted_weight_data = comps.sort_values('abs_weights', ascending=False).head(top_comp)
    ax=plt.subplots(figsize=(10,6))
    ax=sns.barplot(data=sorted_weight_data, 
                   x="weights", 
                   y="features", 
                   palette="Blues_d")
    ax.set_title("ICA Component Makeup, Component #" + str(vector_num))
    plt.show()
In [213]:
plot_ica(0,5)
In [214]:
plot_ica(1,5)

random projection

In [106]:
from sklearn.random_projection import SparseRandomProjection
from sklearn.random_projection import GaussianRandomProjection

rng = np.random.RandomState(42)

gauRP = GaussianRandomProjection(random_state=rng,n_components=5)
SRRp = SparseRandomProjection(random_state=rng,n_components=5)

gau = gauRP.fit_transform(X_stan)
sr = SRRp.fit_transform(X_stan)
In [99]:
transformer = SparseRandomProjection(random_state=rng)
In [100]:
rng = np.random.RandomState(42)
In [226]:
np.dot(gau,gauRP.components_)
Out[226]:
array([[ 1.52728033,  0.39146801, -2.20679248, ...,  4.87810057,
         2.88486553, -0.72385374],
       [ 1.59082937, -0.26242921, -3.84174783, ...,  8.3801579 ,
         2.17716806, -0.63822585],
       [ 2.78571644,  4.54053743, -1.21003996, ...,  8.05376722,
         0.93610003,  0.86619842],
       ...,
       [ 0.50904389, -1.9558814 , -2.03567469, ...,  1.76701068,
        -0.17498889,  1.19058791],
       [ 2.98068916,  0.87754497,  0.51635066, ...,  2.58104647,
        -1.66522965,  2.53961078],
       [-0.4156675 ,  2.06327371,  0.89178231, ...,  1.30649261,
         1.66691293, -0.20532557]])
In [227]:
from sklearn.metrics import mean_squared_error
mean_squared_error(np.dot(gau,gauRP.components_), X_stan)
Out[227]:
7.6861539865946975
In [273]:
ks = [10,20,30,35,45,50,59]#,20,40]
#ks = [5]
def find_rand_proj(ks=ks,X_stan=X_stan):
    results = []
    for k in ks:
        error = 0
        for i in range(5):
            gauRP = GaussianRandomProjection(random_state=i,n_components=k)
            SRRp = SparseRandomProjection(random_state=i,n_components=k)

            gau = SRRp.fit_transform(X_stan)

            X_transformed_large = SRRp.fit_transform(X_stan)
            error += mean_squared_error(np.dot(gau,SRRp.components_.toarray()), X_stan)
        results.append(error/k)
    return results
In [274]:
randp_result = find_rand_proj()
In [242]:
ks = [10,20,30,35,45,50,59]#,20,40]
#ks = [5]
def find_rand_proj(ks=ks,X_stan=X_stan):
    results = []
    for k in ks:
        error = 0
        for i in range(5):
            gauRP = GaussianRandomProjection(random_state=i,n_components=k)
            #SRRp = SparseRandomProjection(random_state=rng,n_components=5)

            gau = gauRP.fit_transform(X_stan)
            
            X_transformed_large = gauRP.fit_transform(X_stan)
            error += mean_squared_error(np.dot(gau,gauRP.components_), X_stan)
        results.append(error/k)
    return results
In [246]:
plt.title('Choosing Number of Components for Randomized Projection')
plt.xlabel('number of clusters')
plt.ylabel('MSE')
plt.xticks(range(len(ks)),ks)
plt.plot(randp_result)
Out[246]:
[<matplotlib.lines.Line2D at 0x1c24d66a860>]
In [275]:
plt.title('Choosing Number of Components for SRR Randomized Projection')
plt.xlabel('number of clusters')
plt.ylabel('MSE')
plt.xticks(range(len(ks)),ks)
plt.plot(randp_result)
Out[275]:
[<matplotlib.lines.Line2D at 0x1c01567ada0>]
In [ ]:
 
In [245]:
randp_result
Out[245]:
[3.1037082540206162,
 0.724347844481368,
 0.3312539985241669,
 0.23534256678913565,
 0.13837032330372603,
 0.11007636931582258,
 0.08165435456456861]
In [276]:
randp_result
Out[276]:
[3.0087145643741318,
 0.7751360392073703,
 0.3226386187619275,
 0.24933547853078197,
 0.15348013559902052,
 0.12112446889172565,
 0.09254018814331817]

choose 45 as the final cluster

In [340]:
gauRP = GaussianRandomProjection(random_state=0,n_components=45)
rp_X = gauRP.fit_transform(X_stan)
In [ ]:
 

decision tree feature selection

In [142]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_stan, y)
Out[142]:
RandomForestClassifier(random_state=42)
In [ ]:
 
In [315]:
X.columns[tree_importance_sorted_idx[:15]][:15]
Out[315]:
Index(['occupation_Never-worked', 'workclass_Never-worked',
       'occupation_Armed-Forces', 'workclass_Without-pay',
       'occupation_Priv-house-serv', 'native.country_El-Salvador',
       'marital.status_Married-AF-spouse', 'native.country_Puerto-Rico',
       'native.country_India', 'marital.status_Married-spouse-absent',
       'race_Other', 'native.country_Philippines', 'native.country_Canada',
       'native.country_Germany', 'race_Amer-Indian-Eskimo'],
      dtype='object')
In [324]:
tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
tree_importance_sorted_idx
Out[324]:
array([28,  7, 22, 12, 30, 51, 15, 56, 53, 17, 46, 55, 50, 52, 43, 39, 13,
       19, 36, 20, 54, 32, 26, 44, 41, 27, 11, 35, 25, 58, 34, 45, 21,  5,
        6, 14, 57,  9, 47, 23, 33, 29, 49, 48, 10, 42, 40,  8, 38, 31, 24,
       18,  3, 37, 16,  4,  2,  1,  0], dtype=int64)
In [ ]:
 
In [319]:
clf.feature_importances_[tree_importance_sorted_idx[:15]]
Out[319]:
array([2.08516315e-06, 3.26302015e-06, 2.69921975e-05, 7.87106504e-05,
       1.54020590e-04, 2.40517415e-04, 4.07483603e-04, 5.71193510e-04,
       7.02471104e-04, 9.73583892e-04, 9.88767180e-04, 1.02891026e-03,
       1.19816553e-03, 1.23068690e-03, 1.59090429e-03])
In [147]:
clf.feature_importances_[tree_importance_sorted_idx[-15:]].sum()
Out[147]:
0.854011328619174
In [233]:
clf.feature_importances_[tree_importance_sorted_idx[-15:]].sum()
Out[233]:
0.854011328619174
In [235]:
# result = permutation_importance(clf, X_stan, y, n_repeats=1,
#                                 random_state=42)
# perm_sorted_idx = result.importances_mean.argsort()

tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5

#fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))

ax1.barh(tree_indices[-5:],
         clf.feature_importances_[tree_importance_sorted_idx[-5:]], height=0.7)
ax1.set_yticklabels(X.columns[tree_importance_sorted_idx][-5:])
ax1.set_yticks(tree_indices[-5:])
#ax1.set_ylim((0, len(clf.feature_importances_)))
# ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False,
#             labels=X.feature_names[perm_sorted_idx])
#fig.tight_layout()
plt.title('Tree Importance')
plt.show()
In [342]:
tree_X = X_stan[:,tree_importance_sorted_idx[-15:]]
In [337]:
X.iloc[0:3,tree_importance_sorted_idx[-15:]]
Out[337]:
workclass_Self-emp-not-inc relationship_Wife relationship_Own-child workclass_Private relationship_Not-in-family occupation_Prof-specialty occupation_Exec-managerial marital.status_Never-married capital.loss relationship_Husband marital.status_Married-civ-spouse hours.per.week capital.gain education.num age
0 0 0 0 0 1 0 0 0 4356 0 0 40 0 9 90
1 0 0 0 1 1 0 1 0 4356 0 0 18 0 9 82
2 0 0 0 0 0 0 0 0 4356 0 0 40 0 10 66
In [309]:
# result = permutation_importance(clf, X_stan, y, n_repeats=1,
#                                 random_state=42)
# perm_sorted_idx = result.importances_mean.argsort()

tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5

#fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))

ax1.barh(tree_indices,
         clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
ax1.set_yticks(tree_indices)
ax1.set_ylim((0, len(clf.feature_importances_)))
# ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False,
#             labels=X.feature_names[perm_sorted_idx])
#fig.tight_layout()
plt.title('Tree Importance')
plt.show()

check running time

In [308]:
t2
Out[308]:
0.27495813369750977
In [312]:
#start = time.time()

transformer = FastICA(n_components=10,
        random_state=0)
pca = PCA(n_components=10,random_state=0)
gauRP = GaussianRandomProjection(random_state=0,n_components=10)
t0 = time.time()
gauRP.fit_transform(X_stan)
t1 = time.time()#-start
transformer.fit_transform(X_stan)
t2 = time.time()#-t1
pca.fit_transform(X_stan)
t3 = time.time()#-t2
print(t1-t0,t2-t1,t3-t2)
0.014001131057739258 0.24799776077270508 0.15600299835205078
In [311]:
time.time()
Out[311]:
1604185393.164158

clustering again with feature reduction

In [344]:
ica_X.shape
Out[344]:
(32561, 5)
In [345]:
pca_X.shape
Out[345]:
(32561, 34)
In [346]:
rp_X.shape
Out[346]:
(32561, 45)
In [347]:
tree_X.shape
Out[347]:
(32561, 15)
In [356]:
data = np.concatenate([ica_X,pca_X,rp_X,tree_X],axis = 1)
In [15]:
# with open('data.npy', 'wb') as f:
#     np.save(f, np.array([1, 2]))
    

# with open('data.npy', 'wb') as f:
#     np.save(f, data)
with open('data.npy', 'rb') as f:
    data = np.load(f)
In [5]:
ks = [2,4,6,8,15,20,100]

def find_gm_k(ks,datas):
    start = time.time()
    totals = []
    for income_modified_stan in datas:
        total = []
        for k in ks:
            print(k)
            gms = GaussianMixture(n_components=k, random_state=10)
            gms.fit(income_modified_stan)
            total.append(gms.aic(income_modified_stan))
        totals.append(total)
        print(time.time()-start)
    return totals
In [16]:
ica_X = data[:,:5]
pca_X = data[:,5:39]
rp_X = data[:,39:84]
tree_X = data[:,84:]
In [8]:
rp_X.shape
Out[8]:
(32561, 45)
In [43]:
ks = [2,4,8,15,20,100]

ica_gm = find_gm_k(ks,[ica_X,pca_X,rp_X,tree_X,X_stan])
2
4
8
15
20
100
33.34653067588806
2
4
8
15
20
100
118.07514691352844
2
4
8
15
20
100
233.380761384964
2
4
8
15
20
100
321.7229850292206
2
4
8
15
20
100
518.0847475528717
In [30]:
rr = ica_gm[:-2]#.append(aa[0])
rr.append(aa[0])
rr
Out[30]:
[[-1316796.7565531451, -1448171.619027654, -1483881.1136296545],
 [2256590.2594623007, 11165.381506818298, -1230622.8865688927],
 [1873400.0746377571, -1909944.5193862868, -3442015.4603913072],
 [699064.7142881149, -721338.147594099, -1080633.9318312793],
 [404155.31543551694, -4414096.212592142, -5410495.864073624]]
In [71]:
ks = [2,4,20,50,70,100]

ica_gm = find_gm_k(ks,[ica_X,pca_X,rp_X,tree_X,X_stan])
2
4
20
50
70
100
45.97849082946777
2
4
20
50
70
100
270.2927396297455
2
4
20
50
70
100
504.8112483024597
2
4
20
50
70
100
694.8953211307526
2
4
20
50
70
100
1032.2071721553802
In [106]:
ica_gm[-1]
Out[106]:
[404155.31543551694,
 -4414096.212592142,
 -9233085.073935254,
 -14674547.129505802,
 -15383628.7600985,
 -16028291.94375075]
In [44]:
bb = pd.DataFrame(np.array(ica_gm).T,columns=['ica','pca','rp','tree','ori'])
In [70]:
bb.to_pickle('em_metric.pkl')
In [ ]:
 
In [50]:
ax = bb.plot()
ax.set_title('AIC vs Number of Clusters')
ax.set_xticks(range(len(ks)),ks)
ax.set_xlabel('Number of Components')
Out[50]:
Text(0.5, 0, 'Number of Components')
In [162]:
ks = ['2','4','20','50','70','100']
In [163]:
bb.index = ks
In [164]:
ax = bb.plot()
ax.set_title('AIC vs Number of Clusters')
ax.set_xticks(range(len(ks)),ks)
ax.set_xlabel('Number of Components')
Out[164]:
Text(0.5, 0, 'Number of Components')
In [46]:
bb
Out[46]:
ica pca rp tree ori
0 -1.316797e+06 2.256590e+06 1.873400e+06 6.990647e+05 4.041553e+05
1 -1.448172e+06 1.116538e+04 -1.909945e+06 -7.213381e+05 -4.414096e+06
2 -1.496528e+06 -1.457444e+06 -4.094173e+06 -1.281238e+06 -5.132499e+06
3 -1.521207e+06 -2.142826e+06 -5.832862e+06 -2.348631e+06 -8.153969e+06
4 -1.528872e+06 -2.743365e+06 -7.059801e+06 -2.971593e+06 -9.233085e+06
5 -1.538923e+06 -6.851770e+06 -1.105167e+07 -3.916372e+06 -1.602829e+07
In [161]:
bb.plot()
Out[161]:
<matplotlib.axes._subplots.AxesSubplot at 0x155cf3d3bc8>
In [158]:
bb
Out[158]:
ica pca rp tree ori
0 -1.316797e+06 2.256590e+06 1.873400e+06 6.990647e+05 4.041553e+05
1 -1.448172e+06 1.116538e+04 -1.909945e+06 -7.213381e+05 -4.414096e+06
2 -1.496528e+06 -1.457444e+06 -4.094173e+06 -1.281238e+06 -5.132499e+06
3 -1.521207e+06 -2.142826e+06 -5.832862e+06 -2.348631e+06 -8.153969e+06
4 -1.528872e+06 -2.743365e+06 -7.059801e+06 -2.971593e+06 -9.233085e+06
5 -1.538923e+06 -6.851770e+06 -1.105167e+07 -3.916372e+06 -1.602829e+07
In [156]:
bb = pd.read_pickle('em_metric.pkl')
In [95]:
len(ica_gm)
Out[95]:
5
In [107]:
fig, ((ax1, ax2),(ax3,ax4),(ax5,ax6)) = plt.subplots(3, 2, figsize=(12, 8))
# ax1.barh(tree_indices,
#          clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
# ax1.set_yticks(tree_indices)
# ax1.set_ylim((0, len(clf.feature_importances_)))
ks = [2,4,20,50,70,100]

ax1.plot(ks,ica_gm[0])
ax1.set_title('ICA')
ax2.plot(ks,ica_gm[1])
ax2.set_title('PCA')

ax3.plot(ks,ica_gm[2])
ax3.set_title('RP')

ax4.plot(ks,ica_gm[3])
ax4.set_title('Tree')

ax5.plot(ks,ica_gm[4])
ax5.set_title('Original')

ax6.axis('off')
plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25,
                    wspace=0.35)
plt.suptitle(("GaussianMixture AIC Score"),
                    fontsize=14, fontweight='bold')
Out[107]:
Text(0.5, 0.98, 'GaussianMixture AIC Score')
In [162]:
fig, ((ax1, ax2),(ax3,ax4)) = plt.subplots(2, 2, figsize=(12, 8))
# ax1.barh(tree_indices,
#          clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
# ax1.set_yticks(tree_indices)
# ax1.set_ylim((0, len(clf.feature_importances_)))
ax1.plot(ica_gm[0])
ax2.plot(ica_gm[1])

ax3.plot(ica_gm[2])

ax4.plot(ica_gm[3])
Out[162]:
[<matplotlib.lines.Line2D at 0x2db07f4bcc8>]

after rotation, the data is easier to be clustered, with same number of cluster, it has a larger silhouette score

In [166]:
silhouette(ica_X)
For n_clusters = 2 The average silhouette_score is : 0.5158532306158032
For n_clusters = 4 The average silhouette_score is : 0.3268625913684567
For n_clusters = 6 The average silhouette_score is : 0.41471154415546146
For n_clusters = 8 The average silhouette_score is : 0.37563988793330355
In [40]:
silhouette(pca_X)
For n_clusters = 2 The average silhouette_score is : 0.1029909505792376
For n_clusters = 4 The average silhouette_score is : 0.12478033681479667
For n_clusters = 6 The average silhouette_score is : 0.11866377181636842
For n_clusters = 8 The average silhouette_score is : 0.13266375440315667
In [41]:
silhouette(tree_X)
For n_clusters = 2 The average silhouette_score is : 0.22608574717544416
For n_clusters = 4 The average silhouette_score is : 0.25736427945523
For n_clusters = 6 The average silhouette_score is : 0.284123304525324
For n_clusters = 8 The average silhouette_score is : 0.3003459338586325
In [ ]:
 
In [42]:
silhouette(rp_X)
For n_clusters = 2 The average silhouette_score is : 0.10256048482952247
For n_clusters = 4 The average silhouette_score is : 0.11122606986422909
For n_clusters = 6 The average silhouette_score is : 0.11211388882341368
For n_clusters = 8 The average silhouette_score is : 0.12433235762882561
In [51]:
silhouette(X_stan)
For n_clusters = 2 The average silhouette_score is : 0.08171694863868836
For n_clusters = 4 The average silhouette_score is : 0.0858816062531556
For n_clusters = 6 The average silhouette_score is : 0.09556409402257389
For n_clusters = 8 The average silhouette_score is : 0.07691835098548624
In [150]:
from scipy.spatial.distance import cdist 
K = [2,5,10,20,30,50,100]
def kmeans_elbow(K,X_stan,seed=0):
    distortions = [] 
    inertias = [] 
    silhouette_avg = []
    mapping1 = {} 
    mapping2 = {} 

    for k in K: 
        #Building and fitting the model 
        kmeanModel = KMeans(n_clusters=k,random_state=seed).fit(X_stan) 
        #kmeanModel.fit(X_stan)     

        distortions.append(sum(np.min(cdist(X_stan, kmeanModel.cluster_centers_, 
                          'euclidean'),axis=1)) / X_stan.shape[0]) 
        inertias.append(kmeanModel.inertia_) 
        silhouette_avg.append(silhouette_score(X_stan, kmeanModel.predict(X_stan)))

#         mapping1[k] = sum(np.min(cdist(X_stan, kmeanModel.cluster_centers_, 
#                      'euclidean'),axis=1)) / X_stan.shape[0] 
#         mapping2[k] = kmeanModel.inertia_
    return silhouette_avg,distortions,inertias
In [255]:
start = time.time()
silhouette_score(X_stan, kmeanModel.predict(X_stan))
time.time()-start
Out[255]:
14.486879587173462

ori kmeans elbow

In [66]:
K = [10,40,70,80,90,100]
distortions,inertias = kmeans_elbow(K,X_stan)
In [267]:
K = [2,4,6,8,10]
sih,distortions,inertias = kmeans_elbow(K,X_stan,seed = 10)
In [270]:
plt.plot(K[:-1], sih[:-1], 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Silhouette') 
plt.title('The Elbow Method using Silhouette') 
plt.show()
In [67]:
plt.plot(K, distortions, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Distortion') 
plt.title('The Elbow Method using Distortion') 
plt.show()
In [68]:
plt.plot(K, inertias, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Inertia') 
plt.title('The Elbow Method using Inertia') 
plt.show()

PCA elbow

In [165]:
K = [10,40,70,80,90,100]

#K = [2,4,6,10,20,50,100]
s,distortions,inertias = kmeans_elbow(K,pca_X)
In [116]:
plt.plot(K, distortions, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Distortion') 
plt.title('PCA Elbow Method using Distortion') 
plt.show()
In [166]:
plt.plot(K, inertias, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Inertia') 
plt.title('PCA Elbow Method using Inertia') 
plt.show()

ICA elbow

In [118]:
K = [10,40,70,80,90,100]

K = [2,4,6,10,20,50,100]
distortions,inertias = kmeans_elbow(K,ica_X)
In [119]:
plt.plot(K, distortions, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Distortion') 
plt.title('ICA Elbow Method using Distortion') 
plt.show()
In [110]:
plt.plot(K, inertias, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Inertia') 
plt.title('ICA Elbow Method using Inertia') 
plt.show()

RP elbow

In [120]:
K = [10,40,70,80,90,100]

K = [2,4,6,10,20,50,100]
distortions,inertias = kmeans_elbow(K,rp_X)
In [122]:
plt.plot(K, distortions, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Distortion') 
plt.title('RP Elbow Method using Distortion') 
plt.show()
In [123]:
plt.plot(K, inertias, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Inertia') 
plt.title('RP Elbow Method using Inertia') 
plt.show()
In [237]:
K = [10,40,70,80,90,100]

K = [2,4,6,8,9]#,20,50,100]
sih,distortions,inertias = kmeans_elbow(K,rp_X)
In [238]:
plt.plot(K, sih, 'bx-') 
plt.xlabel('Values of K') 
plt.title('Silhouette Score') 
plt.show()
In [249]:
plt.plot(K, distortions, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Distortion') 
plt.title('RP Elbow Method using Distortion') 
plt.show()
In [250]:
plt.plot(K, inertias, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Inertia') 
plt.title('RP Elbow Method using Inertia') 
plt.show()
In [ ]:
        silhouette_avg = silhouette_score(X_stan, cluster_labels)

Tree elbow

In [128]:
K = [10,40,70,80,90,100]

K = [2,4,6,10,20,50,100]
distortions,inertias = kmeans_elbow(K,tree_X)
In [129]:
plt.plot(K, distortions, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Distortion') 
plt.title('Tree Elbow Method using Distortion') 
plt.show()
In [130]:
plt.plot(K, inertias, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Inertia') 
plt.title('Tree Elbow Method using Inertia') 
plt.show()

KMEANS with reduction features

In [69]:
silhouette(X_stan,range_n_clusters=[50,80,100])
For n_clusters = 50 The average silhouette_score is : 0.22396943677646775
For n_clusters = 80 The average silhouette_score is : 0.27493495478032753
For n_clusters = 100 The average silhouette_score is : 0.2896473538229642

smaller feature number means a smaller cluster number

cluster = 2**n

In [72]:
silhouette(ica_X,range_n_clusters=[50,80,100])
For n_clusters = 50 The average silhouette_score is : 0.2994562382214242
For n_clusters = 80 The average silhouette_score is : 0.29225239016152915
For n_clusters = 100 The average silhouette_score is : 0.28577725181782593
In [74]:
silhouette(pca_X,range_n_clusters=[50,80,100])
For n_clusters = 50 The average silhouette_score is : 0.2595750003064212
For n_clusters = 80 The average silhouette_score is : 0.3006041780398309
For n_clusters = 100 The average silhouette_score is : 0.32713705609544386
In [75]:
silhouette(rp_X,range_n_clusters=[50,80,100])
For n_clusters = 50 The average silhouette_score is : 0.20886759596107254
For n_clusters = 80 The average silhouette_score is : 0.2602912364214384
For n_clusters = 100 The average silhouette_score is : 0.28374102972641546
In [ ]:
 
In [76]:
silhouette(tree_X,range_n_clusters=[50,80,100])
For n_clusters = 50 The average silhouette_score is : 0.3037411824279752
For n_clusters = 80 The average silhouette_score is : 0.3116660018760009
For n_clusters = 100 The average silhouette_score is : 0.3067608682521907
In [ ]:
silhouette(X_,range_n_clusters=[50,80,100])

reduction clustering

In [171]:
#gm = GaussianMixture(n_components=9, random_state=10)
kmeans = KMeans(n_clusters=9, random_state=0)
kmeans.fit(ica_X)
Out[171]:
KMeans(n_clusters=9, random_state=0)
In [172]:
kmeans.labels_
Out[172]:
array([1, 6, 1, ..., 7, 6, 2])

EM ICA:20 PCA:50 RP:50 tREE:20 ORI:50

Kmeans

In [19]:
n_comp = [20,50,50,20,50]
combined_data = [ica_X,pca_X,rp_X,tree_X,X_stan]
In [197]:
labels = []
for i in range(5):
    gm = GaussianMixture(n_components=n_comp[i], random_state=0)
    kmeans = KMeans(n_clusters=n_comp[i], random_state=0)
    labels.append(gm.fit_predict(combined_data[i]))
    labels.append(kmeans.fit_predict(combined_data[i]))

getting converged rp_X

In [225]:
labels_rp = []
for i in range(5):
    gauRP = GaussianRandomProjection(random_state=i,n_components=45)
    multi_rp_X = gauRP.fit_transform(X_stan)
    gm = GaussianMixture(n_components=50, random_state=0)
    kmeans = KMeans(n_clusters=50, random_state=0)
    labels_rp.append(gm.fit_predict(multi_rp_X))
    labels_rp.append(kmeans.fit_predict(multi_rp_X))
In [226]:
multi_rp_label = pd.DataFrame(np.array(labels_rp).T)#.head()
In [227]:
order_by_count = []
for i in multi_rp_label.columns:
    order_by_count.append(multi_rp_label[i].value_counts())
In [437]:
ax = pd.DataFrame(np.array(order_by_count).T).iloc[:5,[0,2,3,6,8]].plot.bar(
    title='EM cluster distribution for Random Projection')
ax.set_xlabel('Largest 5 Clusters')
Out[437]:
Text(0.5, 0, 'Largest 5 Clusters')
In [440]:
ax = pd.DataFrame(np.array(order_by_count).T).iloc[:5,[1,3,5,7,9]].plot.bar(
    title='Kmeans cluster distribution for Random Projection')
ax.set_xlabel('Largest 5 Clusters')
Out[440]:
Text(0.5, 0, 'Largest 5 Clusters')
In [446]:
multi_rp_label[multi_rp_label[0]==47][1].value_counts()
Out[446]:
32    149
29     22
33      2
34      1
Name: 1, dtype: int64
In [232]:
multi_rp_label[multi_rp_label[0]==47][4].value_counts()
Out[232]:
39    146
31     26
40      2
Name: 4, dtype: int64
In [ ]:
 
In [ ]:
 
In [198]:
with open('reduction_clustering.npy', 'wb') as f:
    np.save(f, labels)
In [186]:
with open('reduction_clustering.npy', 'rb') as f:
    aa = np.load(f)
In [199]:
labels.append(y)
In [200]:
aa = pd.DataFrame(np.array(labels).T)
aa.to_pickle('reduction_clustering.pkl')
In [11]:
aa = pd.read_pickle('reduction_clustering.pkl')
In [12]:
aa.columns = ['ica_gm','ica_kmeans','pca_gm','pca_means','rp_gm','rp_kmeans','tree_gm','tree_kmeans','ori_gm','ori_kmeans','y']
In [ ]:
 
In [208]:
aa.head()
Out[208]:
ica_gm ica_kmeans pca_gm pca_means rp_gm rp_kmeans tree_gm tree_kmeans ori_gm ori_kmeans y
0 1 14 40 8 47 32 8 8 13 30 0
1 6 13 41 37 8 42 8 8 9 30 0
2 9 14 40 44 47 32 8 8 13 30 0
3 11 2 8 46 12 42 8 8 47 30 0
4 11 11 45 22 30 42 8 8 44 30 0
In [231]:
columns = aa.columns[-1]
In [451]:
columns
Out[451]:
Index(['ica_gm', 'ica_kmeans', 'pca_gm', 'pca_means', 'rp_gm', 'rp_kmeans',
       'tree_gm', 'tree_kmeans', 'ori_gm', 'ori_kmeans'],
      dtype='object')
In [452]:
#fig, (ax1, ax2) = plt.subplots(1, 2)
combined_sort_label = []
columns = aa.columns[:-1]
i = 0
for col in columns:
    i+=1
    bar_x = range(max(aa[col])+1)
    bar_y = aa[col].value_counts().to_numpy()
    combined_sort_label.append(bar_y)
    plt.bar(bar_x, bar_y)
    #plt.xticks(range(max(aa[col])+1),aa[col].value_counts().index)
    if i==2:
        i=0
        plt.title(col[:4]+' GM and Kmeans')
        plt.xlabel('cluster group')
        plt.legend(('GM', 'Kmeans'))

        plt.show()
In [ ]:
 
In [199]:
aa[aa.ica_gm==i].y.value_counts().to_numpy()
Out[199]:
array([240, 233], dtype=int64)
In [201]:
aaica=[]
for i in range(20):
    aaica.append(aa[aa.ica_gm==i].y.value_counts().to_numpy())
In [211]:
df_aa.sort_values(by='per',ascending=True)
Out[211]:
0 1 per
19 240 233.0 1.030043
5 1815 1751.0 1.036551
7 3832 3134.0 1.222719
18 927 724.0 1.280387
10 260 192.0 1.354167
15 337 170.0 1.982353
4 482 189.0 2.550265
14 353 127.0 2.779528
12 957 175.0 5.468571
0 925 115.0 8.043478
16 138 17.0 8.117647
8 627 62.0 10.112903
13 523 44.0 11.886364
2 4304 351.0 12.262108
6 960 75.0 12.800000
1 918 51.0 18.000000
11 4330 208.0 20.817308
17 2532 89.0 28.449438
9 374 13.0 28.769231
3 7 NaN NaN
In [217]:
df_aa = pd.DataFrame(aaica)
df_aa['per'] = df_aa.apply(lambda x:x[0]/x[1] if x[1] != 'NaN' else 0,axis=1)
df_aa.sort_values(by='per',ascending=False).to_csv('sortafter.csv')
In [212]:
aa.hist(figsize=(15,12))
Out[212]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000019C958A3508>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000019C963332C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000019C9611C848>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000019C96156288>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000019C96189C48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000019C961CAE48>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000019C961F8FC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000019C96233C08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000019C9623D808>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000019C962769C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000019C964AEF48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000019C964E5FC8>]],
      dtype=object)

checking how good is the new cluster agree with labels with feature reduction

In [288]:
start = time.time()
sil_score = []
cd = [ica_X,ica_X,pca_X,pca_X,rp_X,rp_X,tree_X,tree_X,X_stan,X_stan]
for i in range(10):
    print(i,time.time()-start)
    data = cd[i]
    sil_score.append(silhouette_score(data,labels[i]))
0 0.0
1 11.801583528518677
2 22.98762845993042
3 35.667176723480225
4 47.49974989891052
5 61.96485757827759
6 79.09589552879333
7 99.37141942977905
8 131.26595759391785
9 165.80618715286255
In [289]:
sil_score
Out[289]:
[0.2440362024997056,
 0.3369307112291577,
 0.20414063801170687,
 0.25578133001924874,
 0.18636166607268803,
 0.2089860049234935,
 0.27519778024260805,
 0.2755217649427409,
 0.20867184960638577,
 0.23208484671194432]
In [292]:
gauRP = GaussianRandomProjection(random_state=10,n_components=45)
rp_X = gauRP.fit_transform(X_stan)

gm_rp = gm.fit_predict(rp_X)
km_rp = kmeans.fit_predict(rp_X)

print(silhouette_score(rp_X,gm_rp),silhouette_score(rp_X,km_rp))
In [299]:
gm.aic(rp_X)
Out[299]:
-9650874.893825468
In [301]:
gm.fit(pca_X)
gm.aic(pca_X)
gm_pca = gm.fit_predict(pca_X)
silhouette_score(pca_X,gm_pca)
Out[301]:
0.20414063801170687
In [296]:
print(silhouette_score(rp_X,gm_rp),silhouette_score(rp_X,km_rp))
0.19535002748390878 0.2272804700099387
In [ ]:
 
In [ ]:
 
In [125]:
rr = pd.DataFrame(np.stack((y.to_numpy(),kmeans_label,gm_label)).T,columns = ['label','kmeans','EM'])
rr.head()
Out[125]:
label kmeans EM
0 0 4 0
1 0 0 0
2 0 4 0
3 0 0 3
4 0 0 2
In [402]:
aa.head()
Out[402]:
ica_gm ica_kmeans pca_gm pca_means rp_gm rp_kmeans tree_gm tree_kmeans ori_gm ori_kmeans y
0 1 14 40 8 47 32 8 8 13 30 0
1 6 13 41 37 8 42 8 8 9 30 0
2 9 14 40 44 47 32 8 8 13 30 0
3 11 2 8 46 12 42 8 8 47 30 0
4 11 11 45 22 30 42 8 8 44 30 0
In [404]:
for i in range(6):
    print(aa[aa.ica_gm==i].y.value_counts())
0    925
1    115
Name: y, dtype: int64
0    918
1     51
Name: y, dtype: int64
0    4304
1     351
Name: y, dtype: int64
0    7
Name: y, dtype: int64
0    482
1    189
Name: y, dtype: int64
0    1815
1    1751
Name: y, dtype: int64
In [407]:
for i in range(6):
    print(aa[aa.ica_kmeans==i].y.value_counts())
0    2413
1      12
Name: y, dtype: int64
1    1961
0     875
Name: y, dtype: int64
0    2016
1     145
Name: y, dtype: int64
0    363
1    129
Name: y, dtype: int64
0    7
Name: y, dtype: int64
0    279
1    203
Name: y, dtype: int64
In [133]:
rr[rr.kmeans==1].EM.value_counts()
Out[133]:
4    11750
3      893
1       91
2        5
0        4
Name: EM, dtype: int64
In [134]:
rr[rr.kmeans==0].EM.value_counts()
Out[134]:
2    6297
1    5690
0    4760
3    1107
Name: EM, dtype: int64
In [139]:
rr[rr.kmeans==4].EM.value_counts()
Out[139]:
2    587
4    489
1    421
0    337
Name: EM, dtype: int64

neural net here

In [132]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):

    if axes is None:
        _, axes = plt.subplots(3, 1, figsize=(5, 20))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Error")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    train_scores_mean = np.mean(1-train_scores, axis=1)
    train_scores_std = np.std(1-train_scores, axis=1)
    test_scores_mean = np.mean(1-test_scores, axis=1)
    test_scores_std = np.std(1-test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid()
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Error")
    axes[2].set_title("Performance of the model")

    return plt
In [386]:
import numpy as np
from sklearn.model_selection import train_test_split

data = X_stan

X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)
In [148]:
from sklearn.model_selection import GridSearchCV
start = time.time()
param_grid = [
  {'alpha': [0.1,10,100],'hidden_layer_sizes':[(5,),(5,4),(20,),(20,4)]}
]
mlp = MLPClassifier( random_state=1)
search = GridSearchCV(mlp, param_grid, cv=5)
search.fit(X_train,y_train)
time.time()-start
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
Out[148]:
345.1690776348114
In [149]:
search.best_params_
Out[149]:
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
In [137]:
search.best_params_['alpha']
Out[137]:
10
In [152]:
mlp = MLPClassifier( alpha=0.1,#search.best_params_['alpha'],
                    hidden_layer_sizes=(20,),#search.best_params_['hidden_layer_sizes'],
                    random_state=1)
In [153]:
mlp.fit(X_train,y_train)
mlp.score(X_test,y_test)
Out[153]:
0.8487640104406572
In [102]:
def nn_with_reduction(df,params={'alpha': 0.1, 'hidden_layer_sizes': (20,)}
,grid=True):
    start = time.time()

    X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42)

    param_grid = [{'alpha': [0.01,0.1,10],'hidden_layer_sizes':[(20,)]#(5,),(5,4),(20,),(20,4)]
                  }]
    mlp = MLPClassifier( random_state=1)
    search = GridSearchCV(mlp, param_grid, cv=5)
    t=time.time()
    if grid:
        
        search.fit(X_train,y_train)
        t = time.time()-t
        print(search.best_params_)
    else:
        search.best_params_ = params
    mlp = MLPClassifier( alpha=search.best_params_['alpha'],
                    hidden_layer_sizes=search.best_params_['hidden_layer_sizes'],
                    random_state=1)
    mlp.fit(X_train,y_train)
    print(time.time()-start)
    return mlp,mlp.score(X_test,y_test),t
    
In [219]:
rp_m,s = nn_with_reduction(rp_X)
s
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
775.2970283031464
Out[219]:
0.8487640104406572
In [343]:
rp_m,s = nn_with_reduction(rp_X,False,{'hidden_layer_sizes': (40,)})
s
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.1, 'hidden_layer_sizes': (5, 4)}
414.35329580307007
Out[343]:
0.8456932289267619
In [158]:
nn_with_reduction(ica_X)
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
197.29009866714478
Out[158]:
0.7710732381391064
In [358]:
X_stan[:,:10].shape
Out[358]:
(32561, 10)
In [360]:
ica_X.shape
Out[360]:
(32561, 5)
In [78]:
complete_ica = np.concatenate([X_stan[:,:10],ica_X],axis=1)
In [378]:
mlp_ica,s = nn_with_reduction(complete_ica,{'alpha':0.001,'hidden_layer_sizes': (50,4)},False)
s
14.06903624534607
Out[378]:
0.8441578381698143
In [43]:
mlp_ori,s = nn_with_reduction(X_stan,{'alpha':0.1,'hidden_layer_sizes': (5,4)},False)
s
7.66153621673584
Out[43]:
0.8492246276677414
In [159]:
nn_with_reduction(pca_X)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
419.26055788993835
Out[159]:
0.8489175495163519
In [160]:
nn_with_reduction(X_stan)
{'alpha': 0.1, 'hidden_layer_sizes': (5, 4)}
510.4361500740051
Out[160]:
0.8492246276677414
In [161]:
nn_with_reduction(tree_X)
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
463.924654006958
Out[161]:
0.8524489482573315
In [394]:
mlp_tree,s = nn_with_reduction(tree_X,{'alpha':0.1,'hidden_layer_sizes': (5,4)},False)
s
9.8920156955719
Out[394]:
0.8506064793489944
In [49]:
mlp_tree,s = nn_with_reduction(tree_X,{'alpha':0.1,'hidden_layer_sizes': (20,)},False)
s
7.931020498275757
Out[49]:
0.8524489482573315
In [341]:
training_time = []
xlabel = ['ica','tree','pca','ori']
cd =[ica_X,tree_X,pca_X,X_stan]
for d in cd:
    start = time.time()
    mlp.fit(d,y)
    training_time.append(time.time()-start)
plt.title('Training Time vs Feature Reduction')
plt.xlabel('Feature Reduction Method')
plt.ylabel('Time in Seconds')
plt.xticks(range(4),xlabel)
plt.plot(training_time)
Out[341]:
[<matplotlib.lines.Line2D at 0x19ca007aa48>]
In [338]:
training_time
Out[338]:
[6.848608493804932,
 9.562671899795532,
 12.905877351760864,
 6.495079755783081,
 12.889098405838013]
In [ ]:
    start = time.time()
    mlp.fit(ica_X,y)
    training_time.append(time.time()-start)
In [ ]:
 
In [ ]:
rp
In [52]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score


data = tree_X
    
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)

y_pred = mlp_tree.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot()
#print('precision is {}'.format(precision_score(y_test,y_pred)))
#print('recall is {}'.format(recall_score(y_test,y_pred)))
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      4976
           1       0.73      0.59      0.65      1537

    accuracy                           0.85      6513
   macro avg       0.81      0.76      0.78      6513
weighted avg       0.85      0.85      0.85      6513

In [53]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


data = X_stan
    
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)

y_pred = mlp_ori.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot()
#print('precision is {}'.format(precision_score(y_test,y_pred)))
#print('recall is {}'.format(recall_score(y_test,y_pred)))
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      4976
           1       0.72      0.59      0.65      1537

    accuracy                           0.85      6513
   macro avg       0.80      0.76      0.78      6513
weighted avg       0.84      0.85      0.84      6513

In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

part 5 nn with clustering and feature reduction

In [18]:
aa
Out[18]:
ica_gm ica_kmeans pca_gm pca_means rp_gm rp_kmeans tree_gm tree_kmeans ori_gm ori_kmeans y
0 1 14 40 8 47 32 8 8 13 30 0
1 6 13 41 37 8 42 8 8 9 30 0
2 9 14 40 44 47 32 8 8 13 30 0
3 11 2 8 46 12 42 8 8 47 30 0
4 11 11 45 22 30 42 8 8 44 30 0
... ... ... ... ... ... ... ... ... ... ... ...
32556 2 19 44 27 33 14 3 3 12 6 0
32557 18 11 31 21 32 38 6 6 18 37 0
32558 7 10 8 18 12 21 7 7 47 18 1
32559 6 2 41 48 8 49 17 17 9 34 0
32560 2 0 3 41 36 27 18 18 25 7 0

32561 rows × 11 columns

In [464]:
col_names=['ica','pca','rp','tree']
for col in aa.columns:
    df = aa[col]
Out[464]:
(32561, 45)
In [23]:
ori_gm = aa.ori_gm.apply(lambda x:str(x))
ori_kmeans = aa.ori_kmeans.apply(lambda x:str(x))
In [71]:
cluster_features = StandardScaler().fit_transform(pd.get_dummies(ori_gm))
cluster_features.shape
Out[71]:
(32561, 50)
In [69]:
gm_label
kmeans_label
Out[69]:
array([4, 0, 4, ..., 1, 0, 0])
In [76]:
combined_data[0].shape
Out[76]:
(32561, 5)
In [82]:
q5_data = []
for d in combined_data:
    q5_data.append(np.concatenate([d,cluster_features],axis=1))
q5_data[0]=np.concatenate([complete_ica,cluster_features],axis=1)
In [86]:
for q in q5_data:
    print(q.shape)
(32561, 65)
(32561, 84)
(32561, 95)
(32561, 65)
(32561, 109)
In [104]:
q5model=[]
q5score=[]
q5time=[]
for q in q5_data[:-1]:

    model,s,t = nn_with_reduction(q)
    q5model.append(model)
    q5score.append(s)
    q5time.append(t)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
253.4256570339203
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
177.85305261611938
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.01, 'hidden_layer_sizes': (20,)}
186.50175666809082
{'alpha': 0.01, 'hidden_layer_sizes': (20,)}
141.40194463729858
In [167]:
q5score
Out[167]:
[0.841087056655919, 0.8452326116996776, 0.849531705819131, 0.8502994011976048]
In [84]:
q5_data[0]=np.concatenate([complete_ica,cluster_features],axis=1)
In [91]:
q5_data[0].shape
Out[91]:
(32561, 65)
In [105]:
plt.plot(q5score)
Out[105]:
[<matplotlib.lines.Line2D at 0x155cf376348>]
In [181]:
mlp_tree,s,t = nn_with_reduction(q5_data[-1],{'alpha':0.01,'hidden_layer_sizes': (20,)},False)
s
14.27958059310913
Out[181]:
0.8532166436358053
In [182]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


data = q5_data[-1]
model = mlp_tree
    
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)

y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot()
#print('precision is {}'.format(precision_score(y_test,y_pred)))
#print('recall is {}'.format(recall_score(y_test,y_pred)))
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      4976
           1       0.71      0.63      0.67      1537

    accuracy                           0.85      6513
   macro avg       0.80      0.78      0.79      6513
weighted avg       0.85      0.85      0.85      6513

In [183]:
mlp_rp_5,s,t = nn_with_reduction(q5_data[-2],{'alpha':0.01,'hidden_layer_sizes': (20,)},False)
s
10.0910484790802
Out[183]:
0.8502994011976048
In [184]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


data = q5_data[-2]
model = mlp_rp_5
    
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42)

y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot()
#print('precision is {}'.format(precision_score(y_test,y_pred)))
#print('recall is {}'.format(recall_score(y_test,y_pred)))
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      4976
           1       0.73      0.58      0.65      1537

    accuracy                           0.85      6513
   macro avg       0.80      0.76      0.78      6513
weighted avg       0.84      0.85      0.84      6513

In [ ]:
 
In [107]:
q4model=[]
q4score=[]
q4time=[]
q4_data = [complete_ica,ica_X,pca_X,tree_X]
for q in q4_data:

    model,s,t = nn_with_reduction(q)
    q4model.append(model)
    q4score.append(s)
    q4time.append(t)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.01, 'hidden_layer_sizes': (20,)}
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
181.59414196014404
{'alpha': 0.01, 'hidden_layer_sizes': (20,)}
177.8792781829834
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
133.92912364006042
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
109.81823134422302
In [168]:
q4score
Out[168]:
[0.8464609243052357,
 0.8048518347919545,
 0.8489175495163519,
 0.8524489482573315]
In [186]:
q5score
Out[186]:
[0.841087056655919, 0.8452326116996776, 0.849531705819131, 0.8502994011976048]

q5 with gm

In [219]:
cluster_features = StandardScaler().fit_transform(pd.get_dummies(ori_gm))
cluster_features.shape
Out[219]:
(32561, 50)
In [220]:
q5_data = []
for d in combined_data:
    q5_data.append(np.concatenate([d,cluster_features],axis=1))
q5_data[0]=np.concatenate([complete_ica,cluster_features],axis=1)
In [221]:
q5model=[]
q5score=[]
q5time=[]
for q in q5_data[:-1]:

    model,s,t = nn_with_reduction(q)
    q5model.append(model)
    q5score.append(s)
    q5time.append(t)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
142.58486032485962
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.1, 'hidden_layer_sizes': (20,)}
213.41890501976013
C:\Users\xiaoyal\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:585: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
{'alpha': 0.01, 'hidden_layer_sizes': (20,)}
207.04242324829102
{'alpha': 0.01, 'hidden_layer_sizes': (20,)}
152.8359534740448
In [224]:
q5score
Out[224]:
[0.841087056655919, 0.8452326116996776, 0.849531705819131, 0.8502994011976048]
In [ ]: